Set up

set.seed(2021)

library(MouseGastrulationData)
library(ggplot2)
library(reshape)
library(org.Mm.eg.db)
library(scater)
library(patchwork)
library(gtools)

Load functions

source("../scripts/StabMap_functions.R")

Load E8.5 data using MouseGastrulationData

mt = MouseGastrulationData::AtlasSampleMetadata
samples = mt$sample[mt$stage %in% c("E8.5")]
SCE <- EmbryoAtlasData(samples = samples)
SCE
## class: SingleCellExperiment 
## dim: 29452 20978 
## metadata(0):
## assays(1): counts
## rownames(29452): ENSMUSG00000051951 ENSMUSG00000089699 ...
##   ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ENSEMBL SYMBOL
## colnames(20978): cell_36865 cell_36866 ... cell_139330 cell_139331
## colData names(17): cell barcode ... colour sizeFactor
## reducedDimNames(2): pca.corrected umap
## altExpNames(0):
SCE <- logNormCounts(SCE)
celltype_colours = MouseGastrulationData::EmbryoCelltypeColours
celltype_colours
##                       Epiblast               Primitive Streak 
##                      "#635547"                      "#DABE99" 
##                Caudal epiblast                            PGC 
##                      "#9E6762"                      "#FACB12" 
##      Anterior Primitive Streak                      Notochord 
##                      "#C19F70"                      "#0F4A9C" 
##                  Def. endoderm                            Gut 
##                      "#F397C0"                      "#EF5A9D" 
##               Nascent mesoderm                 Mixed mesoderm 
##                      "#C594BF"                      "#DFCDE4" 
##          Intermediate mesoderm                Caudal Mesoderm 
##                      "#139992"                      "#3F84AA" 
##              Paraxial mesoderm               Somitic mesoderm 
##                      "#8DB5CE"                      "#005579" 
##            Pharyngeal mesoderm                 Cardiomyocytes 
##                      "#C9EBFB"                      "#B51D8D" 
##                      Allantois                   ExE mesoderm 
##                      "#532C8A"                      "#8870AD" 
##                     Mesenchyme Haematoendothelial progenitors 
##                      "#CC7818"                      "#FBBE92" 
##                    Endothelium            Blood progenitors 1 
##                      "#FF891C"                      "#F9DECF" 
##            Blood progenitors 2                     Erythroid1 
##                      "#C9A997"                      "#C72228" 
##                     Erythroid2                     Erythroid3 
##                      "#F79083"                      "#EF4E22" 
##                            NMP           Rostral neurectoderm 
##                      "#8EC792"                      "#65A83E" 
##            Caudal neurectoderm                   Neural crest 
##                      "#354E23"                      "#C3C388" 
##   Forebrain/Midbrain/Hindbrain                    Spinal cord 
##                      "#647A4F"                      "#CDE088" 
##               Surface ectoderm              Visceral endoderm 
##                      "#F7F79E"                      "#F6BFCB" 
##                   ExE endoderm                   ExE ectoderm 
##                      "#7F6874"                      "#989898" 
##              Parietal endoderm 
##                      "#1A1A1A"

Randomly subset 5000 cells from this data, for speed reasons

cells_thin = sample(colnames(SCE), 5000)
SCE_thin = SCE[,cells_thin]
SCE_thin
## class: SingleCellExperiment 
## dim: 29452 5000 
## metadata(0):
## assays(2): counts logcounts
## rownames(29452): ENSMUSG00000051951 ENSMUSG00000089699 ...
##   ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ENSEMBL SYMBOL
## colnames(5000): cell_136951 cell_132095 ... cell_39971 cell_96348
## colData names(17): cell barcode ... colour sizeFactor
## reducedDimNames(2): pca.corrected umap
## altExpNames(0):

Get gene names for SCMER and the PCA method used by Alsu

geneList_SCMER = sapply(mixedsort(list.files("../data/SCMER/", pattern = "nGenes", full.names = TRUE)), 
                        scan, what = "character", simplify = FALSE)
lapply(geneList_SCMER, function(x) all(x %in% rownames(SCE)))
## $`../data/SCMER//nGenes_18.txt`
## [1] TRUE
## 
## $`../data/SCMER//nGenes_32.txt`
## [1] TRUE
## 
## $`../data/SCMER//nGenes_33.txt`
## [1] TRUE
## 
## $`../data/SCMER//nGenes_50.txt`
## [1] TRUE
## 
## $`../data/SCMER//nGenes_53.txt`
## [1] TRUE
## 
## $`../data/SCMER//nGenes_75.txt`
## [1] TRUE
## 
## $`../data/SCMER//nGenes_117.txt`
## [1] TRUE
## 
## $`../data/SCMER//nGenes_137.txt`
## [1] TRUE
## 
## $`../data/SCMER//nGenes_153.txt`
## [1] TRUE
## 
## $`../data/SCMER//nGenes_257.txt`
## [1] TRUE
geneList_PC_ranked_symbols = read.csv("../data/SCMER/selection_spearman.csv", header = TRUE, row.names = 1)[,3]
geneList_PC_ranked = rowData(SCE)$ENSEMBL[match(geneList_PC_ranked_symbols, rowData(SCE)$SYMBOL)]
geneList_PC = lapply(geneList_SCMER, function(x) geneList_PC_ranked[1:length(x)])

Generate similarity for all genes

This will be re-used multiple times. Note this is a dense matrix, as output from igraph::similarity.

full_sim = generateSimilarity(SCE_thin, batchFactor = SCE_thin$sample)
dim(full_sim)
## [1] 5000 5000
full_sim[1:5,1:5]
##             cell_136951 cell_132095 cell_100178 cell_131549 cell_100447
## cell_136951  1.00000000           0 0.000000000  0.05479452 0.000000000
## cell_132095  0.00000000           1 0.000000000  0.00000000 0.000000000
## cell_100178  0.00000000           0 1.000000000  0.00000000 0.004761905
## cell_131549  0.05479452           0 0.000000000  1.00000000 0.000000000
## cell_100447  0.00000000           0 0.004761905  0.00000000 1.000000000

Calculate uncertainty scores for SCMER and PC selected genes

plotAdditional = list("celltype", 
                      list(scale_colour_manual(values = celltype_colours),
                           theme(legend.position = "bottom")))
scores_SCMER = lapply(geneList_SCMER, function(geneList) {
  getSubsetUncertainty(SCE_thin,
                       subsetGenes = geneList,
                       full_sim = full_sim,
                       plot = TRUE,
                       plotAdditional = plotAdditional)
})

scores_PC = lapply(geneList_PC, function(geneList) {
  getSubsetUncertainty(SCE_thin,
                       subsetGenes = geneList,
                       full_sim = full_sim,
                       plot = TRUE,
                       plotAdditional = plotAdditional)
})

Some exploration of these scores, for one subset choice

scores_df = data.frame(cell = colnames(SCE_thin),
                       score = scores_SCMER[[10]],
                       celltype = SCE_thin$celltype)
ggplot(scores_df, aes(x = celltype, y = score)) + 
  geom_boxplot(aes(fill = celltype)) +
  theme_classic() +
  scale_fill_manual(values = celltype_colours) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("") +
  theme(legend.position = "bottom") +
  NULL

Estimate scores for the entire dataset

I can also estimate the uncertainty for the full data, given the subset. This means that I can estimate the uncertainty for all cells, but only pull from the 5000x5000 similarity distribution from the random subset of cells. This is of course an estimate, but it’s worthwhile knowing one can circumvent needing to calculate the similarity across all cells.

joint_sample = SCE$sample
names(joint_sample) <- colnames(SCE)

scores_all_SCMER = lapply(geneList_SCMER, function(geneList) {
  getSubsetUncertainty(SCE_thin,
                       querySCE = SCE[, setdiff(colnames(SCE), colnames(SCE_thin))],
                       subsetGenes = geneList,
                       full_sim = full_sim,
                       plot = TRUE,
                       plotAdditional = plotAdditional,
                       jointBatchFactor = joint_sample)
})

scores_all_PC = lapply(geneList_PC, function(geneList) {
  print(length(geneList))
  getSubsetUncertainty(SCE_thin,
                       querySCE = SCE[, setdiff(colnames(SCE), colnames(SCE_thin))],
                       subsetGenes = geneList,
                       full_sim = full_sim,
                       plot = TRUE,
                       plotAdditional = plotAdditional,
                       jointBatchFactor = joint_sample)
})
## [1] 18

## [1] 32

## [1] 33

## [1] 50

## [1] 53

## [1] 75

## [1] 117

## [1] 137

## [1] 153

## [1] 257

Examine these, first by combining into a single data frame

scores_SCMER_df = do.call(rbind, sapply(names(scores_SCMER), function(score_name) {
  scores = scores_all_SCMER[[score_name]]
  data.frame(cell = names(scores),
             score = scores,
             celltype = SCE[,names(scores)]$celltype,
             sample = SCE[,names(scores)]$sample,
             score_n = score_name,
             type = "SCMER")
}, simplify = FALSE))

scores_PC_df = do.call(rbind, sapply(names(scores_PC), function(score_name) {
  scores = scores_all_PC[[score_name]]
  data.frame(cell = names(scores),
             score = scores,
             celltype = SCE[,names(scores)]$celltype,
             sample = SCE[,names(scores)]$sample,
             score_n = score_name,
             type = "PC")
}, simplify = FALSE))

scores_all_df = rbind(scores_SCMER_df, scores_PC_df)
scores_all_df$score_n <- factor(scores_all_df$score_n, levels = mixedsort(unique(scores_all_df$score_n)))

dim(scores_all_df)
## [1] 419560      6
head(scores_all_df)
##                                                 cell      score
## ../data/SCMER//nGenes_18.txt.cell_136951 cell_136951 0.09469388
## ../data/SCMER//nGenes_18.txt.cell_132095 cell_132095 0.31591837
## ../data/SCMER//nGenes_18.txt.cell_100178 cell_100178 0.38285714
## ../data/SCMER//nGenes_18.txt.cell_131549 cell_131549 0.61551020
## ../data/SCMER//nGenes_18.txt.cell_100447 cell_100447 0.40897959
## ../data/SCMER//nGenes_18.txt.cell_135831 cell_135831 0.20653061
##                                                     celltype sample
## ../data/SCMER//nGenes_18.txt.cell_136951          Erythroid2     37
## ../data/SCMER//nGenes_18.txt.cell_132095          Erythroid3     36
## ../data/SCMER//nGenes_18.txt.cell_100178   Paraxial mesoderm     29
## ../data/SCMER//nGenes_18.txt.cell_131549          Erythroid3     36
## ../data/SCMER//nGenes_18.txt.cell_100447 Pharyngeal mesoderm     29
## ../data/SCMER//nGenes_18.txt.cell_135831          Erythroid3     37
##                                                               score_n  type
## ../data/SCMER//nGenes_18.txt.cell_136951 ../data/SCMER//nGenes_18.txt SCMER
## ../data/SCMER//nGenes_18.txt.cell_132095 ../data/SCMER//nGenes_18.txt SCMER
## ../data/SCMER//nGenes_18.txt.cell_100178 ../data/SCMER//nGenes_18.txt SCMER
## ../data/SCMER//nGenes_18.txt.cell_131549 ../data/SCMER//nGenes_18.txt SCMER
## ../data/SCMER//nGenes_18.txt.cell_100447 ../data/SCMER//nGenes_18.txt SCMER
## ../data/SCMER//nGenes_18.txt.cell_135831 ../data/SCMER//nGenes_18.txt SCMER
ggplot(scores_all_df, aes(x = interaction(type, score_n), y = score)) + 
  geom_boxplot(aes(fill = type)) + 
  theme_classic() +
  # scale_fill_manual(values = celltype_colours) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("") +
  theme(legend.position = "bottom") +
  NULL

ggplot(subset(scores_all_df, score_n == "../data/SCMER//nGenes_257.txt"),
              aes(x = interaction(type, celltype), y = score)) + 
  geom_boxplot(aes(fill = celltype)) + 
  theme_classic() +
  scale_fill_manual(values = celltype_colours) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("") +
  theme(legend.position = "bottom") +
  NULL

Which genes mark differences between cells with high ambiguity?

Here I perform a clustering of the data using the best SCMER genes, extract the clusters with the highest uncertainty values, then re-cluster these cells using the entire transcriptome. Presumably, the markers for these re-clusters represent the gene expression differences that are as yet unaccounted for within the SCMER gene set.

SCE_sub <- logNormCounts(SCE[geneList_SCMER[[length(geneList_SCMER)]],])
fit = modelGeneVar(logcounts(SCE_sub))
HVGs = getTopHVGs(fit)
length(HVGs)
## [1] 120
SCE_sub <- runPCA(SCE_sub, subset_row = rownames(SCE_sub))
SCE_sub_corrected <- fastMNN(SCE_sub, batch = SCE_sub$sample)
PCs_sub = reducedDim(SCE_sub_corrected, "corrected")
reducedDim(SCE_sub, "UMAP") <- calculateUMAP(t(PCs_sub))

cl = clusterSNNGraph(SCE_sub_corrected, use.dimred = "corrected")
table(cl)
## cl
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 1290  265  403 1375  807 3095  981 2738  563  796  734  471  200  123  111  657 
##   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32 
##  163  548   67  141  207   24  863  806   40  132  843  605  118  845  288   97 
##   33   34   35   36   37 
##   27   45  245  249   16
SCE_sub$cluster_SCMER = paste0("SCMER_cluster_", cl)
SCE_sub$score = scores_all_SCMER[[length(scores_all_SCMER)]][colnames(SCE_sub)]

plotUMAP(SCE_sub, colour_by = "cluster_SCMER") + scale_colour_discrete() + ggtitle("Clusters using SCMER genes")

plotUMAP(SCE_sub, colour_by = "score")

plotUMAP(SCE_sub, colour_by = "celltype") + scale_colour_manual(values = celltype_colours)

ggplot(as.data.frame(colData(SCE_sub)), aes(x = cluster_SCMER, y = score)) + 
  geom_boxplot(aes(fill = cluster_SCMER)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  NULL

There are clusters that have a high uncertainty value, while others remain fairly low. This means that for these clusters, there may be additional genes that are needed to better estimate the cell-cell neighbourhoods.

Re-cluster the cells within these high uncertainty clusters using all genes, and extract their marker genes. Visualise these on the full UMAP and examine what genes they are.

clusters_ordered_uncertainty = names(sort(tapply(SCE_sub$score, SCE_sub$cluster_SCMER, median), decreasing = TRUE))
clusters_ordered_uncertainty
##  [1] "SCMER_cluster_2"  "SCMER_cluster_13" "SCMER_cluster_35" "SCMER_cluster_7" 
##  [5] "SCMER_cluster_17" "SCMER_cluster_24" "SCMER_cluster_6"  "SCMER_cluster_3" 
##  [9] "SCMER_cluster_22" "SCMER_cluster_33" "SCMER_cluster_28" "SCMER_cluster_27"
## [13] "SCMER_cluster_21" "SCMER_cluster_14" "SCMER_cluster_26" "SCMER_cluster_4" 
## [17] "SCMER_cluster_11" "SCMER_cluster_23" "SCMER_cluster_31" "SCMER_cluster_29"
## [21] "SCMER_cluster_20" "SCMER_cluster_34" "SCMER_cluster_8"  "SCMER_cluster_1" 
## [25] "SCMER_cluster_10" "SCMER_cluster_16" "SCMER_cluster_30" "SCMER_cluster_5" 
## [29] "SCMER_cluster_36" "SCMER_cluster_19" "SCMER_cluster_25" "SCMER_cluster_9" 
## [33] "SCMER_cluster_18" "SCMER_cluster_12" "SCMER_cluster_32" "SCMER_cluster_37"
## [37] "SCMER_cluster_15"
minFDRs_list = list()

for (i in 1:5) {
  print(i)
  
  SCE_sub$cluster_SCMER_tmp = SCE_sub$cluster_SCMER == clusters_ordered_uncertainty[i]
  g = plotUMAP(SCE_sub, colour_by = "cluster_SCMER_tmp") +
    scale_colour_manual(values = c("TRUE" = "red", "FALSE" = "grey"), na.value = "grey") +
    ggtitle(paste("Clusters using SCMER genes on SCMER genes UMAP, ",
                  clusters_ordered_uncertainty[i]))
  print(g)
  
  g = plotReducedDim(SCE_sub, dimred = "umap", colour_by = "cluster_SCMER_tmp") +
    scale_colour_manual(values = c("TRUE" = "red", "FALSE" = "grey"), na.value = "grey") +
    ggtitle(paste("Clusters using SCMER genes on full data UMAP, ",
                  clusters_ordered_uncertainty[i])) +
    coord_fixed()
  print(g)
  
  table(SCE_sub$celltype, SCE_sub$cluster_SCMER_tmp)
  
  # cluster within these cells
  SCE_full = SCE[,colnames(SCE_sub)[SCE_sub$cluster_SCMER_tmp]]
  SCE_full <- logNormCounts(SCE_full)
  fit = modelGeneVar(logcounts(SCE_full), block = SCE_full$sample)
  HVGs = getTopHVGs(fit)
  length(HVGs)
  SCE_full <- runPCA(SCE_full, subset_row = HVGs)
  SCE_full_corrected <- fastMNN(SCE_full, batch = SCE_full$sample)
  SCE_full_clusters = clusterSNNGraph(SCE_full_corrected, use.dimred = "corrected")
  table(SCE_full_clusters)
  SCE_full$full_cluster = SCE_full_clusters
  
  full_cluster_markers_array = getMarkerArray(SCE_full, "full_cluster")
  minFDRs = rowMins(full_cluster_markers_array[,,"FDR"], na.rm = TRUE)
  names(minFDRs) <- dimnames(full_cluster_markers_array)[[1]]
  
  minFDRs_list[[i]] <- minFDRs
  
  sort(minFDRs)[1:5]
  table(minFDRs < 0.01)
  head(minFDRs[minFDRs < 0.01])
  rowData(SCE_full)$SYMBOL[order(minFDRs)[1:5]]
  
  gList = sapply(rowData(SCE_full)$SYMBOL[order(minFDRs)[1:9]], function(gene) {
    plotReducedDim(SCE, dimred = "umap", 
                   colour_by = gene,
                   swap_rownames = "SYMBOL",
                   by_exprs_values = "logcounts") +
      coord_fixed()
  }, simplify = FALSE)
  g = wrap_plots(gList, nrow = 3, ncol = 3)
  print(g)
}
## [1] 1

## [1] "1"
## [1] "2"
## [1] "3"

## [1] 2

## [1] "1"
## [1] "2"
## [1] "3"

## [1] 3

## [1] "1"
## [1] "2"
## [1] "3"

## [1] 4

## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"
## [1] "7"
## [1] "8"

## [1] 5

## [1] "1"
## [1] "2"
## [1] "3"

sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-apple-darwin19.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /usr/local/Cellar/openblas/0.3.13/lib/libopenblasp-r0.3.13.dylib
## LAPACK: /usr/local/Cellar/r/4.0.4/lib/R/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] BiocNeighbors_1.8.2         batchelor_1.6.2            
##  [3] igraph_1.2.6                bluster_1.0.0              
##  [5] scran_1.18.3                gtools_3.8.2               
##  [7] patchwork_1.1.1             scater_1.18.3              
##  [9] org.Mm.eg.db_3.12.0         AnnotationDbi_1.52.0       
## [11] reshape_0.8.8               ggplot2_3.3.3              
## [13] MouseGastrulationData_1.4.0 SingleCellExperiment_1.12.0
## [15] SummarizedExperiment_1.20.0 Biobase_2.50.0             
## [17] GenomicRanges_1.42.0        GenomeInfoDb_1.26.2        
## [19] IRanges_2.24.1              S4Vectors_0.28.1           
## [21] BiocGenerics_0.36.0         MatrixGenerics_1.2.0       
## [23] matrixStats_0.57.0         
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6                  bit64_4.0.5                  
##  [3] RcppAnnoy_0.0.18              httr_1.4.2                   
##  [5] tools_4.0.4                   ResidualMatrix_1.0.0         
##  [7] R6_2.5.0                      irlba_2.3.3                  
##  [9] vipor_0.4.5                   uwot_0.1.10                  
## [11] DBI_1.1.1                     colorspace_2.0-0             
## [13] withr_2.4.1                   tidyselect_1.1.0             
## [15] gridExtra_2.3                 bit_4.0.4                    
## [17] curl_4.3                      compiler_4.0.4               
## [19] DelayedArray_0.16.1           labeling_0.4.2               
## [21] scales_1.1.1                  rappdirs_0.3.3               
## [23] stringr_1.4.0                 digest_0.6.27                
## [25] rmarkdown_2.6                 XVector_0.30.0               
## [27] pkgconfig_2.0.3               htmltools_0.5.1.1            
## [29] sparseMatrixStats_1.2.0       limma_3.46.0                 
## [31] dbplyr_2.1.0                  fastmap_1.1.0                
## [33] rlang_0.4.10                  RSQLite_2.2.3                
## [35] shiny_1.6.0                   DelayedMatrixStats_1.12.2    
## [37] farver_2.0.3                  generics_0.1.0               
## [39] BiocParallel_1.24.1           dplyr_1.0.3                  
## [41] RCurl_1.98-1.2                magrittr_2.0.1               
## [43] BiocSingular_1.6.0            GenomeInfoDbData_1.2.4       
## [45] scuttle_1.0.4                 Matrix_1.3-2                 
## [47] Rcpp_1.0.6                    ggbeeswarm_0.6.0             
## [49] munsell_0.5.0                 viridis_0.5.1                
## [51] lifecycle_0.2.0               edgeR_3.32.1                 
## [53] stringi_1.5.3                 yaml_2.2.1                   
## [55] zlibbioc_1.36.0               plyr_1.8.6                   
## [57] BiocFileCache_1.14.0          AnnotationHub_2.22.0         
## [59] grid_4.0.4                    blob_1.2.1                   
## [61] dqrng_0.2.1                   promises_1.1.1               
## [63] ExperimentHub_1.16.0          crayon_1.3.4                 
## [65] lattice_0.20-41               cowplot_1.1.1                
## [67] beachmat_2.6.4                locfit_1.5-9.4               
## [69] knitr_1.30                    pillar_1.4.7                 
## [71] codetools_0.2-18              glue_1.4.2                   
## [73] BiocVersion_3.12.0            evaluate_0.14                
## [75] BiocManager_1.30.10           vctrs_0.3.6                  
## [77] httpuv_1.5.5                  gtable_0.3.0                 
## [79] purrr_0.3.4                   assertthat_0.2.1             
## [81] cachem_1.0.1                  xfun_0.20                    
## [83] rsvd_1.0.3                    mime_0.9                     
## [85] xtable_1.8-4                  RSpectra_0.16-0              
## [87] later_1.1.0.1                 viridisLite_0.3.0            
## [89] tibble_3.0.5                  beeswarm_0.2.3               
## [91] memoise_2.0.0                 statmod_1.4.35               
## [93] ellipsis_0.3.1                interactiveDisplayBase_1.28.0